In [82]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('bmh')
%matplotlib inline
In [83]:
# To learn more about the data set https://archive.ics.uci.edu/ml/datasets/pima+indians+diabetes
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
# Fix: the raw .data file has NO header row. Without header=None, pandas
# treats the first patient record as column labels, silently dropping one
# row of data (767 rows instead of 768). header=None keeps every record;
# proper column names are assigned in a later cell.
df = pd.read_csv(data_url, header=None)
In [84]:
# Preview the first five rows to sanity-check the load
df.head()
Out[84]:
In [85]:
# Attach descriptive column names (the source file ships without a header)
column_names = [
    "#pregnancies", "glucose_conc", "blood_pressure", "skin_thickness",
    "serum_insulin", "bmi", "dpf", "age", "class",
]
df.columns = column_names
In [86]:
# Confirm the new column names rendered correctly
df.head()
Out[86]:
In [87]:
# (rows, columns) of the loaded data set
df.shape
Out[87]:
In [88]:
# Features are every column except the last; the target is the final "class" column
feature_cols = df.columns[:-1]
X = df[feature_cols]
y = df[df.columns[-1]]
In [89]:
# Split data into a training and testing datasets
# 70/30 hold-out split; random_state fixes the shuffle for reproducibility
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
In [90]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
In [91]:
# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_train_score = lr.score(X_train, y_train)
lr_test_score = lr.score(X_test, y_test)
# Fix: the cell printed `train_score` / `test_score`, names never defined
# anywhere in the notebook (NameError on a fresh Restart & Run All).
# The scores computed above are lr_train_score / lr_test_score.
print("Accuracy of training score is", lr_train_score)
print("Accuracy of testing score is", lr_test_score)
In [92]:
# Random Forest — fit returns the estimator itself, so fit-and-bind in one step
rf = RandomForestClassifier().fit(X_train, y_train)
# Accuracy on both splits; a large train/test gap indicates overfitting
rf_train_score = rf.score(X_train, y_train)
rf_test_score = rf.score(X_test, y_test)
print("Accuracy of training score is", rf_train_score)
print("Accuracy of testing score is", rf_test_score)
In [93]:
# Naive Bayes classifier (multinomial variant; all features here are non-negative)
nb = MultinomialNB().fit(X_train, y_train)
# Accuracy on the training and held-out test sets
nb_train_score = nb.score(X_train, y_train)
nb_test_score = nb.score(X_test, y_test)
print("Accuracy of training score is", nb_train_score)
print("Accuracy of testing score is", nb_test_score)
In [94]:
# Support Vector Machine with default RBF kernel
svm = SVC().fit(X_train, y_train)
# Accuracy on the training and held-out test sets
svm_train_score = svm.score(X_train, y_train)
svm_test_score = svm.score(X_test, y_test)
print("Accuracy of training score is", svm_train_score)
print("Accuracy of testing score is", svm_test_score)
In [95]:
# Compare held-out test accuracy across the four classifiers as a bar chart
classifiers = ["Logistic_Reg", "Random_Forest", "Naive_Bayes", "Support_Vector"]
scores = [lr_test_score, rf_test_score, nb_test_score, svm_test_score]
positions = range(len(classifiers))
plt.bar(positions, scores, align='center', alpha=0.5)
plt.xticks(positions, classifiers)
plt.ylabel('Testing score')
plt.title('Comparison of ML classifiers')
Out[95]:
In [96]:
# Fix: sklearn.grid_search was deprecated in scikit-learn 0.18 and removed
# in 0.20; GridSearchCV now lives in sklearn.model_selection (the same
# module already used above for train_test_split).
from sklearn.model_selection import GridSearchCV
In [103]:
# Hyperparameter grid for tuning the random forest.
# (Names grid_values / grid_search are reused by the reporting cell below.)
grid_values = {
    'n_estimators': [5, 10, 20, 50],       # number of trees
    'max_depth': [50, 150, 250],           # depth limit per tree
    'min_samples_split': [2, 3],           # min samples to split a node
    'min_samples_leaf': [1, 2, 3],         # min samples at a leaf
}
# 3-fold CV over the full grid, parallelized across all cores
grid_search = GridSearchCV(rf, param_grid=grid_values, verbose=1, n_jobs=-1, cv=3)
grid_search.fit(X_train, y_train)
Out[103]:
In [104]:
# Report the best cross-validated score and the winning hyperparameters
print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
# Iterating a dict yields its keys, so .keys() is unnecessary
for param_name in sorted(grid_values):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))
In [ ]: